This data was sourced from a study conducted by Boudreault et al, cited below. Alternative splicing (AS) is a method of gene regulation in some Eukaryotes which modifies the sequence of RNA transcripts. Alternative Splicing changes the composition of the proteins resulting from the RNA transcripts through differential choice of exons included in mature mRNAs. This results in higher variability and diversity in the cellular proteome.The study from which this dataset is lifted looks into the global changes in RNA splicing that occurs after infection with a human virus.
Study Citation: Boudreault S, Martenon-Brodeur C, Caron M, Garant JM, Tremblay MP, Armero VE, Durand M, Lapointe E, Thibault P, Tremblay-Létourneau M, Perreault JP, Scott MS, Lemay G, Bisaillon M. Global Profiling of the Cellular Alternative RNA Splicing Landscape during Virus-Host Interactions. PLoS One. 2016 Sep 6;11(9):e0161914.doi: 10.1371/journal.pone.0161914. PMID: 27598998; PMCID: PMC5012649. ________________________________________________________________________________
This data analysis uses additional packages beyond base R, which includes the package manager pacman.
knitr::opts_chunk$set(echo = TRUE,
# warning = FALSE,
# message = FALSE,n
fig.align = "center",
fig.height = 8)
#install.packages('pacman')
#install.packages(c("BiocManager", "RColorBrewer", "tidyverse", "devtools", "pheatmap", "gprofiler2"))
#install.packages("genekitr")
#install.packages("biomaRt")
#BiocManager::install(c("DESeq2", "org.Hs.eg.db", "EnhancedVolcano", "hypeR"))
pacman::p_load(dplyr, ggplot2, DESeq2, tidyverse, forcats,
readr, usedist, gridExtra, DESeq2, RColorBrewer, pheatmap,
genekitr, clusterProfiler, EnhancedVolcano, ggfortify, biomartr,
hypeR, gprofiler2, BiocManager)
#BiocManager::install(c("DESeq2", "org.Hs.eg.db", "EnhancedVolcano", "hypeR", "biomaRt"))
theme_set(theme_bw())
This is reading in the metadata into a data frame and doing some preliminary data cleaning, which includes separating values and using the janitor function to clean up the metadata.
meta <- read.table(file = './Data/SRP074247/metadata_SRP074247.tsv',
sep = '\t',
header = TRUE)
janitor::clean_names(meta)
## refinebio_accession_code experiment_accession refinebio_age
## 1 SRR3471108 SRP074247 NA
## 2 SRR3471109 SRP074247 NA
## 3 SRR3471110 SRP074247 NA
## 4 SRR3471111 SRP074247 NA
## 5 SRR3471112 SRP074247 NA
## 6 SRR3471113 SRP074247 NA
## 7 SRR3471114 SRP074247 NA
## 8 SRR3471115 SRP074247 NA
## 9 SRR3471116 SRP074247 NA
## refinebio_cell_line refinebio_compound refinebio_developmental_stage
## 1 l929 NA NA
## 2 l929 NA NA
## 3 l929 NA NA
## 4 l929 NA NA
## 5 l929 NA NA
## 6 l929 NA NA
## 7 l929 NA NA
## 8 l929 NA NA
## 9 l929 NA NA
## refinebio_disease refinebio_disease_stage refinebio_genetic_information
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## 7 NA NA NA
## 8 NA NA NA
## 9 NA NA NA
## refinebio_organism refinebio_platform
## 1 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 2 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 3 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 4 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 5 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 6 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 7 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 8 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 9 MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## refinebio_processed refinebio_processor_id refinebio_processor_name
## 1 True 381 Tximport
## 2 True 381 Tximport
## 3 True 381 Tximport
## 4 True 381 Tximport
## 5 True 381 Tximport
## 6 True 381 Tximport
## 7 True 381 Tximport
## 8 True 381 Tximport
## 9 True 381 Tximport
## refinebio_processor_version refinebio_race refinebio_sex
## 1 v1.25.8-hotfix NA NA
## 2 v1.25.8-hotfix NA NA
## 3 v1.25.8-hotfix NA NA
## 4 v1.25.8-hotfix NA NA
## 5 v1.25.8-hotfix NA NA
## 6 v1.25.8-hotfix NA NA
## 7 v1.25.8-hotfix NA NA
## 8 v1.25.8-hotfix NA NA
## 9 v1.25.8-hotfix NA NA
## refinebio_source_archive_url refinebio_source_database
## 1 NA SRA
## 2 NA SRA
## 3 NA SRA
## 4 NA SRA
## 5 NA SRA
## 6 NA SRA
## 7 NA SRA
## 8 NA SRA
## 9 NA SRA
## refinebio_specimen_part refinebio_subject
## 1 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## 2 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## 3 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## 4 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## 5 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## 6 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## 7 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## 8 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## 9 c3h/an mouse conjunctive tissue (fibroblasts) l929 cell line
## refinebio_time refinebio_title
## 1 NA Mock Cells, Replicate 1
## 2 NA Mock Cells, Replicate 2
## 3 NA Mock Cells, Replicate 3
## 4 NA Cells infected with Reovirus, replicate 1
## 5 NA Cells infected with Reovirus, replicate 2
## 6 NA Cells infected with Reovirus, replicate 3
## 7 NA Cells infected with Mutant Reovirus, replicate 1
## 8 NA Cells infected with Mutant Reovirus, replicate 2
## 9 NA Cells infected with Mutant Reovirus, replicate 3
## refinebio_treatment meta_sra_age
## 1 NA 100
## 2 NA 100
## 3 NA 100
## 4 NA 100
## 5 NA 100
## 6 NA 100
## 7 NA 100
## 8 NA 100
## 9 NA 100
It is important to clean the metadata so the data analysis is accurate and streamlined. meta_clean is a cleaned version of the meta data frame. The treatment variables we are interested in are Mock, Reovirus, and Mutant. The names in meta were cleaned to only include the treatment type and the replicate number. First the treatment type and replicate numbers are separated out into their own columns, and then recombined for a cleaner view of what each column is telling us.
meta_clean <-
meta |>
# Changing columns to choose based on how our dataset is stored
mutate('sample_ID' = refinebio_accession_code,
'title' = refinebio_title,
'treatment' = refinebio_title,
'replicate' = refinebio_title,
)
for(i in 1:nrow(meta_clean)){
if(grepl('Mock', meta_clean$title[i])){
meta_clean$treatment[i] <- 'uninfected'}
if(grepl('with Reovirus', meta_clean$title[i])){
meta_clean$treatment[i] <- 'infected'}
if(grepl('Mutant', meta_clean$title[i])){
meta_clean$treatment[i] <- 'mutant'}
}
for(i in 1:nrow(meta_clean)){
if(grepl('1', meta_clean$title[i])){
meta_clean$replicate[i] <- '1'}
if(grepl('2', meta_clean$title[i])){
meta_clean$replicate[i] <- '2'}
if(grepl('3', meta_clean$title[i])){
meta_clean$replicate[i] <- '3'}
}
meta_clean <-
meta_clean |>
filter(treatment %in% c('uninfected', 'infected')) |>
mutate('sample' = paste(treatment, replicate, sep = "_")) |>
select(sample_ID, sample, title, treatment, replicate) |>
janitor::clean_names()
meta_clean
## sample_id sample title treatment
## 1 SRR3471108 uninfected_1 Mock Cells, Replicate 1 uninfected
## 2 SRR3471109 uninfected_2 Mock Cells, Replicate 2 uninfected
## 3 SRR3471110 uninfected_3 Mock Cells, Replicate 3 uninfected
## 4 SRR3471111 infected_1 Cells infected with Reovirus, replicate 1 infected
## 5 SRR3471112 infected_2 Cells infected with Reovirus, replicate 2 infected
## 6 SRR3471113 infected_3 Cells infected with Reovirus, replicate 3 infected
## replicate
## 1 1
## 2 2
## 3 3
## 4 1
## 5 2
## 6 3
The counts matrix is a data frame which contains the actual data output from the RNA-seq experiment, and will be used for the differential gene expression analysis.
Reading in the counts matrix
# Read in counts matrix
data <- read.table(file = './Data/SRP074247/SRP074247.tsv',
sep = '\t',
header = TRUE,
row.names = 1) |>
select(-c(SRR3471114, SRR3471115, SRR3471116))
head(data)
## SRR3471108 SRR3471109 SRR3471110 SRR3471111
## ENSMUSG00000000001 19889.8320 19485.824000 15567.427000 16893.19700
## ENSMUSG00000000003 0.0000 0.000000 0.000000 0.00000
## ENSMUSG00000000028 966.2573 1009.518100 790.992900 544.20966
## ENSMUSG00000000031 0.0000 0.000000 0.000000 0.00000
## ENSMUSG00000000037 186.0445 196.689480 166.312350 73.05573
## ENSMUSG00000000049 0.0000 1.311085 0.703601 4.89068
## SRR3471112 SRR3471113
## ENSMUSG00000000001 16797.210000 13591.75300
## ENSMUSG00000000003 0.000000 0.00000
## ENSMUSG00000000028 695.027340 492.18677
## ENSMUSG00000000031 0.000000 0.00000
## ENSMUSG00000000037 78.764824 76.60153
## ENSMUSG00000000049 1.354603 0.00000
identical(colnames(data), meta_clean$sample_id)
## [1] TRUE
This boolean flag checks that the cleaned data matches the original data, which it does.
This is renaming the columns created above for treatment type (infected or uninfected) and replicate number on the counts matrix
colnames(data) <- paste(meta_clean$treatment, meta_clean$replicate, sep = "_")
tibble(data)
## # A tibble: 41,249 × 6
## uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2 infected_3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 19890. 19486. 15567. 16893. 16797. 13592.
## 2 0 0 0 0 0 0
## 3 966. 1010. 791. 544. 695. 492.
## 4 0 0 0 0 0 0
## 5 186. 197. 166. 73.1 78.8 76.6
## 6 0 1.31 0.704 4.89 1.35 0
## 7 831. 830. 617. 254. 213. 181.
## 8 3131. 3077. 2470. 2406. 2472. 2053.
## 9 5645. 5921. 4623. 18342. 19434. 15835.
## 10 714. 843. 781. 675. 706. 594.
## # ℹ 41,239 more rows
This is adding the gene name to each row
data_clean <- rownames_to_column(data, 'gene')
tibble(data_clean)
## # A tibble: 41,249 × 7
## gene uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2 infected_3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSM… 19890. 19486. 15567. 16893. 16797. 13592.
## 2 ENSM… 0 0 0 0 0 0
## 3 ENSM… 966. 1010. 791. 544. 695. 492.
## 4 ENSM… 0 0 0 0 0 0
## 5 ENSM… 186. 197. 166. 73.1 78.8 76.6
## 6 ENSM… 0 1.31 0.704 4.89 1.35 0
## 7 ENSM… 831. 830. 617. 254. 213. 181.
## 8 ENSM… 3131. 3077. 2470. 2406. 2472. 2053.
## 9 ENSM… 5645. 5921. 4623. 18342. 19434. 15835.
## 10 ENSM… 714. 843. 781. 675. 706. 594.
## # ℹ 41,239 more rows
Initially, the read matrix had one row for every gene, and each column was the read count for each treatment and replicate. By pivoting the data, we now are able to have more than one row for each gene (there are now nine rows for each gene, as there are nine sample treatment types), and the columns are divided by sample, treatment, replicate, gene, and reads. This way, the data is easier to assess and the data frame contains more information than just the read counts.
data_long <-
pivot_longer(data = data_clean,
cols = -gene,
names_to = 'ID',
values_to = 'reads') |>
mutate('sample' = ID) |>
separate(col = 'ID',
into = c('treatment', 'replicate'),
sep = '_') |>
mutate(treatment = as.factor(treatment),
replicate = as.factor(replicate)) |>
select(sample, treatment, replicate, gene, reads)
tibble(data_long)
## # A tibble: 247,494 × 5
## sample treatment replicate gene reads
## <chr> <fct> <fct> <chr> <dbl>
## 1 uninfected_1 uninfected 1 ENSMUSG00000000001 19890.
## 2 uninfected_2 uninfected 2 ENSMUSG00000000001 19486.
## 3 uninfected_3 uninfected 3 ENSMUSG00000000001 15567.
## 4 infected_1 infected 1 ENSMUSG00000000001 16893.
## 5 infected_2 infected 2 ENSMUSG00000000001 16797.
## 6 infected_3 infected 3 ENSMUSG00000000001 13592.
## 7 uninfected_1 uninfected 1 ENSMUSG00000000003 0
## 8 uninfected_2 uninfected 2 ENSMUSG00000000003 0
## 9 uninfected_3 uninfected 3 ENSMUSG00000000003 0
## 10 infected_1 infected 1 ENSMUSG00000000003 0
## # ℹ 247,484 more rows
colors <- c('infected' = '#FA8633',
'uninfected' = 'lightseagreen')
For gene expression profiling experiments, we hope for between 10-25 million reads per sample. Below is a bar plot showing the sum of the total reads for each sample
We must find if there is a significant different in the total counts by treatment
sum_by_sample <-
data_long |>
group_by(treatment, sample) |>
summarize('sum' = sum(reads)) |>
ungroup()
## `summarise()` has grouped output by 'treatment'. You can override using the
## `.groups` argument.
ggplot(data = data_long,
mapping = aes(x = sample)) +
geom_col(data = sum_by_sample,
mapping = aes(y = sum,
fill = treatment),
color = 'black') +
labs(x = 'Sample',
y = 'Total Reads',
title = 'Assessing the quality of the dataset using number of reads',
caption = 'Data from refine.bio\nAccession ID: SRP074247',
fill = 'Sample type') +
theme(plot.title = element_text(hjust = 0.5,
size = 16),
axis.text.x = element_text(angle = 90,
vjust = 0.5,
hjust = 1),
axis.title.y = element_text(size = 14),
axis.title.x = element_text(size = 14),
legend.position = 'right',
plot.caption = element_text(face = 'italic')) +
scale_fill_manual(values = colors) +
scale_y_continuous(expand = c(0.01, 0))
Figure 1. A barplot assessing the quality of the dataset
used. For each gene expression profiling experiment, a good
quality dataset will have 10-25 million reads per sample. As shown in
the plot, all six samples are within the range desired.
ggplot(data = sum_by_sample |> group_by(treatment) |> summarize('sum' = sum(sum)),
mapping = aes(x = sum,
y = treatment,
fill = treatment)) +
geom_boxplot(data = data_long |> group_by(reads),
mapping = aes(x = log2(reads))) +
scale_fill_manual(values = colors) +
labs(title = 'Number of reads by sample type',
x = 'log2(reads)',
y = NULL,
fill = 'Sample type') +
theme(plot.title = element_text(hjust = 0.5,
size = 16))
## Warning: Removed 91153 rows containing non-finite values (`stat_boxplot()`).
Figure 2. A Boxplot of the number of reads by sample
type. It is important to analyze the read counts between
infected vs uninfected because if there are statistical differences,
this will impact downstream biological interpretation.
We are conducting a T-Test to evaluate whether or not there is a statistical difference in counts by treatment. We are hoping for a high p-value which will indicate no statistical significance in the difference in read counts by treatment.
Rather than conducting a T-test to establish statistical significance, we will use Analysis of variance (ANOVA). This is because we have 3 treatment variables, and a T test is only able to analyze significance between 2 variables.
t.test(colSums(data) ~ meta_clean$treatment)
##
## Welch Two Sample t-test
##
## data: colSums(data) by meta_clean$treatment
## t = 1.2091, df = 3.9817, p-value = 0.2935
## alternative hypothesis: true difference in means between group infected and group uninfected is not equal to 0
## 95 percent confidence interval:
## -3614589 9173758
## sample estimates:
## mean in group infected mean in group uninfected
## 31201792 28422207
The p-value is .2935, so there is not a significant difference between total number of counts by treatment group.
sum_by_gene <-
data_long |>
group_by(gene) |>
summarize('sum' = sum(reads)) |>
ungroup()
ggplot(data = sum_by_gene,
mapping = aes(x = log2(sum))) +
geom_histogram(bins = 15,
color = 'black',
fill = '#9A6DA3') +
labs(x = 'Log-2 transformed counts per gene',
y = 'Number of genes',
title = 'Number of reads by gene') +
theme(plot.title = element_text(hjust = 0.5,
size = 16))
## Warning: Removed 9696 rows containing non-finite values (`stat_bin()`).
Figure 3. A histogram of the sum number of reads across each
gene. This histogram is including low-abundance gene. There are
six samples worth of read counts included in this histogram.
high_abun_genes <-
data_long |>
pivot_wider(names_from = gene,
values_from = reads) |>
select(where(~ all(. != 0)))
high_abun_long <-
high_abun_genes |>
pivot_longer(cols = -c(sample, treatment, replicate),
names_to = 'gene',
values_to = 'reads')
high_abun_sum <-
high_abun_long |>
group_by(gene) |>
summarize('sum' = sum(reads)) |>
ungroup()
ggplot(data = high_abun_sum,
mapping = aes(x = log2(sum))) +
geom_histogram(bins = 15,
color = 'black',
fill = '#9A6DA3') +
labs(x = 'Log-2 transformed counts per gene',
y = 'Number of genes',
title = 'Number of reads by gene (excluding low abundance genes)') +
theme(plot.title = element_text(hjust = 0.5,
size = 16))
Figure 4. A histogram of the sum number of reads across each
gene. This histogram is excluding low abundance genes, or genes
with read counts of zero. There are six samples worth of read counts
included in this histogram.
non_normal_graph <-
ggplot(data = high_abun_long,
mapping = aes(x = sample)) +
geom_boxplot(mapping = aes(y = log2(reads),
fill = treatment),
color = 'black') +
labs(x = 'Sample ID',
y = 'Log 2 transformed counts per gene',
title = 'Non-normalized counts',
caption = 'Data from refine.bio\nAccession ID: SRP074247',
fill = 'Sample type') +
theme(plot.title = element_text(hjust = 0.5,
size = 16),
axis.text.x = element_text(angle = 90,
vjust = 0.5,
hjust = 1),
axis.title.y = element_text(size = 14),
axis.title.x = element_text(size = 14),
legend.position = 'right',
plot.caption = element_text(face = 'italic')) +
scale_fill_manual(values = colors) +
scale_y_continuous(expand = c(0.01, 0))
non_normal_graph
Figure 5. A non-normalized boxplot of the sum number of reads
across each gene. The boxplot provides insight into the
distribution of read counts per sample.
sample_names <- c('uninfected_1', 'uninfected_2', 'uninfected_3', 'infected_1', 'infected_2', 'infected_3')
high_abun_log2 <-
high_abun_long |>
mutate(reads = log2(reads)) |>
pivot_wider(names_from = gene,
values_from = reads)
high_abun_log2_flipped <-
data.frame(t(high_abun_log2 |> select(-c(treatment, replicate)))) |> janitor::row_to_names(row_number = 1) |>
mutate_all(function(x) as.numeric(as.character(x)))
data_dist <- dist(high_abun_log2, method = "euclidean")
## Warning in dist(high_abun_log2, method = "euclidean"): NAs introduced by
## coercion
data_dist <- dist_setNames(data_dist, sample_names)
plot(hclust(data_dist, method = "complete"), xlab = "Sample", main = NULL)
Figure 6. A dendrogram comparing the similarity of read count
data across the different samples based on non-normalized data.
The three infected samples are most similar to one another,
and the three uninfected samples are most similar to one another, which
is to be expected.
Generating correction factors for the data where the median of each row is subtracted from the mean of the row. We then apply this correction factor to each sample.
sample_medians <- (apply(high_abun_log2_flipped, 2, median))
grand_median <- mean(sample_medians)
correction_factors <- grand_median - sample_medians
corrections <- data.frame(correction_factors)
corrections
## correction_factors
## uninfected_1 -0.1308265
## uninfected_2 -0.1311785
## uninfected_3 0.1408655
## infected_1 -0.0419730
## infected_2 -0.0345170
## infected_3 0.1976295
norm_data <- high_abun_log2_flipped
for(i in 1:ncol(norm_data)){
norm_data[,i] <- high_abun_log2_flipped[,i] + corrections$correction_factors[i]
}
norm_data_long <-
norm_data |> rownames_to_column('gene') |>
pivot_longer(cols = -gene,
names_to = 'ID',
values_to = 'normalized_reads') |>
mutate('sample' = ID) |>
separate(col = 'ID',
into = c('treatment', 'replicate'),
sep = '_') |>
mutate(treatment = as.factor(treatment),
replicate = as.factor(replicate)) |>
select(sample, treatment, replicate, gene, normalized_reads)
normal_graph <-
ggplot(data = norm_data_long,
mapping = aes(x = sample)) +
geom_boxplot(mapping = aes(y = normalized_reads,
fill = treatment),
color = 'black') +
labs(x = 'Sample ID',
y = 'Log 2 transformed counts per gene',
title = 'Normalized counts',
caption = 'Data from refine.bio\nAccession ID: SRP074247',
fill = 'Sample type') +
theme(plot.title = element_text(hjust = 0.5,
size = 16),
axis.text.x = element_text(angle = 90,
vjust = 0.5,
hjust = 1),
axis.title.y = element_text(size = 14),
axis.title.x = element_text(size = 14),
legend.position = 'right',
plot.caption = element_text(face = 'italic')) +
scale_fill_manual(values = colors) +
scale_y_continuous(expand = c(0.01, 0))
normal_graph
Figure 7. A normalized boxplot of the sum number of reads across
each gene. The boxplot provides insight into the distribution
of read counts per sample.
grid.arrange(non_normal_graph, normal_graph, ncol=2)
Figure 8. A side by side comparison of the non-normalized vs
normalized counts boxplots, to see what difference the normalization
made in the figure outcome.
norm_log2 <-
norm_data_long |>
mutate(normalized_reads = log2(normalized_reads)) |>
pivot_wider(names_from = gene,
values_from = normalized_reads)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `normalized_reads = log2(normalized_reads)`.
## Caused by warning:
## ! NaNs produced
norm_dist <- dist(norm_log2, method = "euclidean")
## Warning in dist(norm_log2, method = "euclidean"): NAs introduced by coercion
norm_dist <- dist_setNames(norm_dist, sample_names)
plot(hclust(norm_dist, method = "complete"), xlab = "Sample", main = NULL)
Figure 9. A dendrogram comparing the similarity of read count
data across the different samples based on normalized data.
The three infected samples are most similar to one another, and the
three uninfected samples are most similar to one another, which is to be
expected.
Principle Component Analysis is a technique for analyzing large datasets with multiple features to be considered. By using PCA, one is able to maximize interpretability while maintaining the maximum amount of information. T
PCA <- prcomp(t(norm_dist))
autoplot(PCA, data = data_long, color = 'treatment', main = "Colored by Treatment", size = 3)
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
Figure 10. PCA which colors by treatment. This PCA1
gives a percentage of variance of 42.59%, which indicates 42.59% of the
variance in the data can be explained based on this variable, and PCA2
gives a percentage of variance of 19.93%. These variables are the two
strongest sources of variance in the sample set.
data_round <-
round(data.matrix(data))
#Make sure order is correct
head(data_round,2)
## uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2
## ENSMUSG00000000001 19890 19486 15567 16893 16797
## ENSMUSG00000000003 0 0 0 0 0
## infected_3
## ENSMUSG00000000001 13592
## ENSMUSG00000000003 0
meta_clean[, c('sample')]
## [1] "uninfected_1" "uninfected_2" "uninfected_3" "infected_1" "infected_2"
## [6] "infected_3"
dds <- DESeqDataSetFromMatrix(countData = data_round,
colData = meta_clean,
design = ~ treatment)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
By pre-filtering the DESeq2 functions, we are able to eliminate rows with very few reads and therefore reduce the memory used by the dds object. This improves speed and visualization quality.
# pre-filter to remove low-read rows
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
We must choose a reference level for factors based on treatment type.
dds$treatment <- relevel(dds$treatment, ref = "uninfected")
# run Differential Expression Analysis
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
head(res)
## log2 fold change (MLE): treatment infected vs uninfected
## Wald test p-value: treatment infected vs uninfected
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000000001 16796.973 -0.185684 0.0467589 -3.97109 7.15435e-05
## ENSMUSG00000000028 737.681 -0.647278 0.1004754 -6.44215 1.17792e-10
## ENSMUSG00000000037 127.985 -1.227260 0.1929378 -6.36091 2.00559e-10
## ENSMUSG00000000056 477.308 -1.780200 0.1182042 -15.06039 2.95050e-51
## ENSMUSG00000000058 2565.718 -0.291981 0.0559741 -5.21637 1.82468e-07
## ENSMUSG00000000078 11555.826 1.759546 0.0458610 38.36696 0.00000e+00
## padj
## <numeric>
## ENSMUSG00000000001 2.38709e-04
## ENSMUSG00000000028 6.52838e-10
## ENSMUSG00000000037 1.09498e-09
## ENSMUSG00000000056 7.59753e-50
## ENSMUSG00000000058 7.89931e-07
## ENSMUSG00000000078 0.00000e+00
print('Dimensions:')
## [1] "Dimensions:"
dim(res)
## [1] 23799 6
vsd <- vst(dds, blind=FALSE)
plotPCA(vsd, intgroup=c("treatment")) + theme_classic() + scale_color_manual(values = colors)
## using ntop=500 top features by variance
Figure 11. Principal Component Analysis using variance
stabilizing transformation (vst). This data is transformed on a
log2 scale and then normalized with respect of library size. The
transformation is not blind because the experiment design has
a large difference in counts between the treatment types, and blind
dispersion estimates wil, overly shrink the values towards one another.
This plot demonstrates a PCA based on the first two principle components
of the data.
rld <- rlog(dds, blind=FALSE)
rld_mat <- assay(rld)
rld_cor <- cor(rld_mat)
heat.colors <- brewer.pal(6, "Blues")
pheatmap(rld_cor, color = heat.colors,
breaks = c(0.97, 0.99, 0.995, 0.999, 0.9995, 0.99995))
Figure 12. A hierarchical clustering heatmap of the similarities
between the read counts in each sample. This heatmap is
comparing the similarity of each sample to each other sample, and the
darker blue the sample is, the more similar the two are to one another.
The uninfected samples are most similar to one another, with
uninfected_1 and uninfected_2 being the most similar to each other. The
infected samples are each similarly different to one another, which is
indicated in the lightest blue shade.
infected_n_res <- results(dds, contrast = c("treatment", "infected", "uninfected"))
infected_n_sigs <- na.omit(infected_n_res)
infected_n_sigs <- subset(infected_n_sigs, padj < 0.05 & abs(log2FoldChange) > 1)
infected_n_sig_data <- merge(data.frame(infected_n_sigs),
data.frame(counts(dds, normalized = TRUE)),
by = "row.names", sort=FALSE)
names(infected_n_sig_data)[1] <- "Ensembl_ID"
infected_n_res_data <- merge(data.frame(infected_n_res),
data.frame(counts(dds, normalized = TRUE)),
by="row.names", sort=FALSE)
names(infected_n_res_data)[1] <- "Ensembl_ID"
head(res)
## log2 fold change (MLE): treatment infected vs uninfected
## Wald test p-value: treatment infected vs uninfected
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000000001 16796.973 -0.185684 0.0467589 -3.97109 7.15435e-05
## ENSMUSG00000000028 737.681 -0.647278 0.1004754 -6.44215 1.17792e-10
## ENSMUSG00000000037 127.985 -1.227260 0.1929378 -6.36091 2.00559e-10
## ENSMUSG00000000056 477.308 -1.780200 0.1182042 -15.06039 2.95050e-51
## ENSMUSG00000000058 2565.718 -0.291981 0.0559741 -5.21637 1.82468e-07
## ENSMUSG00000000078 11555.826 1.759546 0.0458610 38.36696 0.00000e+00
## padj
## <numeric>
## ENSMUSG00000000001 2.38709e-04
## ENSMUSG00000000028 6.52838e-10
## ENSMUSG00000000037 1.09498e-09
## ENSMUSG00000000056 7.59753e-50
## ENSMUSG00000000058 7.89931e-07
## ENSMUSG00000000078 0.00000e+00
We are converting gene IDs from Ensemble ID to symbols for a clearer picture
ids <- as.character(infected_n_res_data$Ensembl_ID)
gene_list <- biomaRt::getBM(filter = 'ensembl_gene_id',
attributes = c("ensembl_gene_id","uniprot_gn_symbol"),
values = ids,
mart = biomaRt::useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl"))
Adding the new symbols to the dataset
infected_n_res_data <- merge(infected_n_res_data, gene_list, by.x="Ensembl_ID", by.y="ensembl_gene_id")
infected_n_sig_data <- merge(infected_n_sig_data, gene_list, by.x="Ensembl_ID", by.y="ensembl_gene_id")
#output tabular files
write.csv(infected_n_res_data,
quote = FALSE,
file = "infected_vs_uninfected_normalized_matrix.csv")
#infected_n_sig_data <- subset(infected_n_res_data_gene,
# padj < 0.05 & abs(log2FoldChange) > 1)
write.csv(infected_n_sig_data,
quote = FALSE,
file = "infected_vs_uninfected_deg_padj0.05_log2fc1.csv")
EnhancedVolcano(infected_n_res_data,
lab = as.character(infected_n_res_data$uniprot_gn_symbol),
x = 'log2FoldChange',
y = 'padj',
title = "Infected Versus Uninfected Controls",
subtitle = "Differential expression",
caption = paste0("Upregulated = 330, Downregulated = 222"),
xlim = c(-10,10),
ylim = c(0,20),
FCcutoff = 1,
pCutoff = 0.05,
labSize = 4,
axisLabSize = 12,
col=c('grey40', 'grey40', 'grey40', '#FA8633'),
legendLabels=c('Not sig.','Log2FC','padj', 'padj & Log2FC'),
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 5.0,
gridlines.major = FALSE,
gridlines.minor = FALSE)
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
Figure 13. A Volcano plot of the infected samples vs uninfected
samples. Log transformed adjusted p-values are plotted on the
y-axis while the log2 fold change is plotted on the x-axis.
# plotting upregulated gene
d <- plotCounts(dds, gene="ENSMUSG00000024678", intgroup="treatment", returnData=TRUE)
# Plotting the MOV10 normalized counts, using the samplenames (rownames of d as labels)
ggplot(d, aes(x = treatment, y = count, color = treatment)) +
geom_point(position=position_jitter(w = 0.1,h = 0),
size = 2.5) +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual(values = colors) +
labs(title = "Ms4a4d",
x = 'Treatment',
y = 'Count')
Figure 14. Demonstration of up-regulation using Ms4a4d
gene. There is much higher gene expression in the infected
samples compared to the uninfected sample, indicating this gene’s
expression is impacted by virus-host interactions.
# plotting downregulated gene
d <- plotCounts(dds, gene="ENSMUSG00000056328", intgroup="treatment", returnData=TRUE)
# Plotting the MOV10 normalized counts, using the samplenames (rownames of d as labels)
ggplot(d, aes(x = treatment, y = count, color = treatment)) +
geom_point(position=position_jitter(w = 0.1,h = 0),
size = 2.5) +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual(values = colors) +
labs(title = "Myh1",
x = 'Treatment',
y = 'Count')
Figure 14. Demonstration of down-regulation using Myh1
gene. There is much higher gene expression in the uninfected
samples compared to the uninfected sample, indicating this gene’s
expression is impacted by virus-host interactions.
Gathering the top 20 differentially expressed genes in the dataset.
## Order results by padj values
top20_sigOE_genes <- infected_n_res_data |>
arrange(padj) |> #Arrange rows by padj values
pull(uniprot_gn_symbol) |> #Extract character vector of ordered genes
head(n=20) #Extract the first 20 genes
## normalized counts for top 20 significant genes
top20_sigOE_norm <- infected_n_sig_data |>
filter(uniprot_gn_symbol %in% top20_sigOE_genes)
# Gathering the columns to have normalized counts to a single column
gathered_top20_sigOE <- top20_sigOE_norm |>
gather(colnames(top20_sigOE_norm)[8:13], key = "sample", value = "normalized_counts")
## check the column header in the "gathered" data frame
head(gathered_top20_sigOE)
## Ensembl_ID baseMean log2FoldChange lfcSE stat pvalue padj
## 1 ENSMUSG00000000078 11555.826 1.759546 0.04586097 38.36696 0 0
## 2 ENSMUSG00000000275 12527.828 3.706646 0.07581458 48.89094 0 0
## 3 ENSMUSG00000000386 15980.147 10.206198 0.16649298 61.30107 0 0
## 4 ENSMUSG00000001123 3666.440 3.923118 0.06682685 58.70572 0 0
## 5 ENSMUSG00000002307 4991.193 3.812616 0.08460128 45.06570 0 0
## 6 ENSMUSG00000002325 1925.976 3.598447 0.07621423 47.21490 0 0
## uniprot_gn_symbol sample normalized_counts
## 1 Klf6 uninfected_1 5252.72999
## 2 Trim25 uninfected_1 1903.82384
## 3 Mx1 uninfected_1 37.22041
## 4 Lgals9 uninfected_1 443.85336
## 5 Daxx uninfected_1 551.79254
## 6 Irf9 uninfected_1 303.34632
gathered_top20_sigOE <- inner_join(meta_clean, gathered_top20_sigOE)
## Joining with `by = join_by(sample)`
## plot using ggplot2
ggplot(data = gathered_top20_sigOE,
mapping = aes(x = as.character(uniprot_gn_symbol), y = normalized_counts, color = treatment)) +
geom_point() +
scale_y_log10() +
xlab("Gene") +
ylab("log10 Normalized Counts") +
ggtitle("Top 20 Significant Differentially Expressed Genes") +
theme_bw() +
scale_color_manual(values = colors) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5))
Figure 15. Plot of the normalized read counts of the top 20 most
significant differentially expressed genes in the dataset.
To determine whether certain items are over or underrepresented, we are using a hypergeometric test to determine the probability of having our observed proportion of genes. The KEGG database is used to compare the data against a manually curated database.
KEGG <- msigdb_gsets(species="Mus musculus",
category="C2",
subcategory="CP:KEGG",
clean = TRUE)
hyp_kegg <- hypeR(gene_list$uniprot_gn_symbol,
KEGG,
test="hypergeometric",
background=50000,
fdr=0.05,
plotting=TRUE)
hyp_dots(hyp_kegg,
title="KEGG",
abrv=30,
val = "fdr")
Figure 16. Dot plot of the top enriched genesets from the KEGG
database. The color of the dot is indicative of its
significance, and its size is indicative of its geneset size.
Hallmark set summarized the information across multiple genesets, and reduced redundancy.
HALLMARK <- msigdb_gsets("Mus musculus",
"H",
"",
clean = TRUE)
# Hallmark
hyp_Hallmark <- hypeR(gene_list$uniprot_gn_symbol,
HALLMARK,
test="hypergeometric",
background=50000,
fdr=0.05,
plotting=TRUE)
#show top pathway
hyp_dots(hyp_Hallmark,
title="HALLMARK",
abrv=50,
val = "fdr")
Figure 17. Dot Plot based on the Hallmark database. The
color is indicative of the geneset significance, and its size is
indicative of the geneset size.
GOBP<- msigdb_gsets(species = "Mus musculus",
"C5",
"GO:BP",
clean = TRUE)
# GOBP
hyp_GOBP <- hypeR(gene_list$uniprot_gn_symbol,
GOBP,
test="hypergeometric",
background=50000,
fdr=0.05,
plotting=TRUE)
#show top pathway
hyp_dots(hyp_GOBP, title="GO: Biological Pathways", abrv=50, val = "fdr")
## Warning: Transformation introduced infinite values in continuous y-axis
Figure 18. Gene Ontology biological pathway
visualization. This is referring to the biological role
involving each gene. The color insicates the significance of the
biologicak pathway, and the size of the dot indicates the geneset
size.
up_degs <- subset(infected_n_sig_data, log2FoldChange > 0)
up_degs <- up_degs$uniprot_gn_symbol
up_degs <- na.omit(up_degs)
down_degs <- subset(infected_n_sig_data, log2FoldChange < 0)
down_degs <- down_degs$uniprot_gn_symbol
down_degs <- na.omit(down_degs)
hyp_KEGG_up <- hypeR(up_degs,
KEGG,
test="hypergeometric",
background=50000,
fdr=0.05,
plotting=TRUE)
#show top pathway
#plot_KEGG_up <-
hyp_dots(hyp_KEGG_up, title="KEGG Pathways Upregulated in infected samples", abrv=50, val = "fdr")
#ggsave("Dotplot_KEGG_upregulated.png")
Figure 19. Pathway analysis using only upregulated genes.
# KEGG_down
hyp_KEGG_down <- hypeR(down_degs,
KEGG,
test="hypergeometric",
background=50000,
fdr=0.05,
plotting=TRUE)
#plot_KEGG_down <-
hyp_dots(hyp_KEGG_down, title="KEGG Pathways Downregulated in infected samples", abrv=50, val = "fdr")
#ggsave("Dotplot_KEGG_downregulated.png")
Figure 20. Pathway analysis using only downregulated genes.
# THIS WHERE WE AT BUT IT'S BEDTIME
# TODO: Figure out how to get this to run
# TODO: Get the figures in their own folder(?)
sle_vs_n_gost <- gost(query = gene_list$uniprot_gn_symbol,
organism = "mmusculus",
sources = c("GO:BP", "KEGG", "REAC", "WP"),
significant = TRUE,
correction_method ="fdr",
domain_scope="annotated")
gostplot(sle_vs_n_gost, interactive = TRUE)
# IDK WHAT THIS IS FOR
REACTOME <- msigdb_gsets(species="Mus musculus",
category="C2",
subcategory="CP:REACTOME")